knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.11     ✔ rsample      1.3.1 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(reportRmd)
## systemfonts and textshaping have been compiled with different versions of Freetype. Because of this, textshaping will not use the font cache provided by systemfonts
library(sjPlot)
## 
## Attaching package: 'sjPlot'
## 
## The following object is masked from 'package:ggplot2':
## 
##     set_theme
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(parallel)
library(finalfit)
## 
## Attaching package: 'finalfit'
## 
## The following object is masked from 'package:reportRmd':
## 
##     extract_labels
library(gtsummary)
library(mlbench)
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(rsample)
library(tune)
library(recipes)
library(yardstick)
library(parsnip)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10
library(themis)
library(corrr)
library(performance)
## 
## Attaching package: 'performance'
## 
## The following objects are masked from 'package:yardstick':
## 
##     mae, rmse
library(utils)
library(see)

Load Data

data_1 <- read_csv("canpath_imputed.csv")
## Rows: 41187 Columns: 93
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ID
## dbl (92): ADM_STUDY_ID, SDC_GENDER, SDC_AGE_CALC, SDC_MARITAL_STATUS, SDC_ED...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(data_1)
##  [1] "ID"                        "ADM_STUDY_ID"             
##  [3] "SDC_GENDER"                "SDC_AGE_CALC"             
##  [5] "SDC_MARITAL_STATUS"        "SDC_EDU_LEVEL"            
##  [7] "SDC_EDU_LEVEL_AGE"         "SDC_INCOME"               
##  [9] "SDC_INCOME_IND_NB"         "SDC_HOUSEHOLD_ADULTS_NB"  
## [11] "SDC_HOUSEHOLD_CHILDREN_NB" "HS_GEN_HEALTH"            
## [13] "PA_TOTAL_SHORT"            "PM_BMI_SR"                
## [15] "HS_ROUTINE_VISIT_EVER"     "HS_DENTAL_VISIT_EVER"     
## [17] "HS_FOBT_EVER"              "HS_COL_EVER"              
## [19] "HS_SIG_EVER"               "HS_SIG_COL_EVER"          
## [21] "HS_POLYP_EVER"             "HS_PSA_EVER"              
## [23] "WH_CONTRACEPTIVES_EVER"    "WH_HFT_EVER"              
## [25] "WH_MENOPAUSE_EVER"         "WH_HRT_EVER"              
## [27] "WH_HYSTERECTOMY_EVER"      "WH_OOPHORECTOMY_EVER"     
## [29] "HS_MMG_EVER"               "HS_PAP_EVER"              
## [31] "DIS_HBP_EVER"              "DIS_MI_EVER"              
## [33] "DIS_STROKE_EVER"           "DIS_ASTHMA_EVER"          
## [35] "DIS_COPD_EVER"             "DIS_DEP_EVER"             
## [37] "DIS_DIAB_EVER"             "DIS_LC_EVER"              
## [39] "DIS_CH_EVER"               "DIS_CROHN_EVER"           
## [41] "DIS_UC_EVER"               "DIS_IBS_EVER"             
## [43] "DIS_ECZEMA_EVER"           "DIS_SLE_EVER"             
## [45] "DIS_PS_EVER"               "DIS_MS_EVER"              
## [47] "DIS_OP_EVER"               "DIS_ARTHRITIS_EVER"       
## [49] "DIS_CANCER_EVER"           "DIS_HBP_FAM_EVER"         
## [51] "DIS_MI_FAM_EVER"           "DIS_STROKE_FAM_EVER"      
## [53] "DIS_ASTHMA_FAM_EVER"       "DIS_COPD_FAM_EVER"        
## [55] "DIS_DEP_FAM_EVER"          "DIS_DIAB_FAM_EVER"        
## [57] "DIS_LC_FAM_EVER"           "DIS_CH_FAM_EVER"          
## [59] "DIS_CROHN_FAM_EVER"        "DIS_UC_FAM_EVER"          
## [61] "DIS_IBS_FAM_EVER"          "DIS_ECZEMA_FAM_EVER"      
## [63] "DIS_SLE_FAM_EVER"          "DIS_PS_FAM_EVER"          
## [65] "DIS_MS_FAM_EVER"           "DIS_OP_FAM_EVER"          
## [67] "DIS_ARTHRITIS_FAM_EVER"    "DIS_CANCER_FAM_EVER"      
## [69] "DIS_CANCER_F_EVER"         "DIS_CANCER_M_EVER"        
## [71] "DIS_CANCER_SIB_EVER"       "DIS_CANCER_CHILD_EVER"    
## [73] "ALC_EVER"                  "SMK_CIG_EVER"             
## [75] "SMK_CIG_WHOLE_EVER"        "DIS_ENDO_HB_CHOL_EVER"    
## [77] "DIS_CARDIO_HD_EVER"        "DIS_RESP_SLEEP_APNEA_EVER"
## [79] "DIS_MH_ANXIETY_EVER"       "DIS_MH_ADDICTION_EVER"    
## [81] "DIS_NEURO_MIGRAINE_EVER"   "PSE_ADULT_WRK_DURATION"   
## [83] "PSE_WRK_FREQ"              "WRK_FULL_TIME"            
## [85] "WRK_PART_TIME"             "WRK_RETIREMENT"           
## [87] "WRK_HOME_FAMILY"           "WRK_UNABLE"               
## [89] "WRK_UNEMPLOYED"            "WRK_UNPAID"               
## [91] "WRK_STUDENT"               "WRK_EMPLOYMENT"           
## [93] "WRK_SCHEDULE_CUR_CAT"
summary(data_1$SDC_EDU_LEVEL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   4.000   4.522   6.000   7.000
# check missing in education level
sum(is.na(data_1$SDC_EDU_LEVEL))
## [1] 0

Data Preprocessing

data <- data_1 |> select(ID,
                       SDC_AGE_CALC, 
                       PA_TOTAL_SHORT, 
                       PM_BMI_SR, 
                       SDC_EDU_LEVEL_AGE, 
                       PSE_ADULT_WRK_DURATION, 
                       SDC_INCOME,
                       SDC_GENDER)

data$SDC_INCOME <- as.factor(data_1$SDC_INCOME)
data$SDC_GENDER <- as.factor(data_1$SDC_GENDER)

Preliminary analysis

Summary Statistics

rm_covsum(data=data, 
covs=c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION'))
n=41187
SDC AGE CALC
Mean (sd) 51.5 (10.8)
Median (Min,Max) 52 (30, 74)
PA TOTAL SHORT
Mean (sd) 2606.5 (2691.8)
Median (Min,Max) 1800 (0, 19278)
PM BMI SR
Mean (sd) 27.7 (6.2)
Median (Min,Max) 26.6 (8.9, 69.4)
SDC EDU LEVEL AGE
Mean (sd) 25.3 (9.2)
Median (Min,Max) 23 (-7, 73)
PSE ADULT WRK DURATION
Mean (sd) 6.7 (9.5)
Median (Min,Max) 2 (0, 51)
## Detailed summary statistics???

rm_covsum(data=data,
covs=c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION'))
n=41187
SDC AGE CALC
Mean (sd) 51.5 (10.8)
Median (Min,Max) 52 (30, 74)
PA TOTAL SHORT
Mean (sd) 2606.5 (2691.8)
Median (Min,Max) 1800 (0, 19278)
PM BMI SR
Mean (sd) 27.7 (6.2)
Median (Min,Max) 26.6 (8.9, 69.4)
SDC EDU LEVEL AGE
Mean (sd) 25.3 (9.2)
Median (Min,Max) 23 (-7, 73)
PSE ADULT WRK DURATION
Mean (sd) 6.7 (9.5)
Median (Min,Max) 2 (0, 51)

Correlations

data %>% 
  correlate() %>%
  rearrange() %>%
  shave()  %>%
  rplot(print_cor=TRUE)
## Non-numeric variables removed from input: `ID`, `SDC_INCOME`, and `SDC_GENDER`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the corrr package.
##   Please report the issue at <https://github.com/tidymodels/corrr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

pca_recipe <- recipe(~., data = data) %>%
  update_role(ID, SDC_INCOME, SDC_GENDER, new_role = "id") %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_pca(all_predictors(), id = "pca_id")

pca_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## predictor: 5
## id:        3
## 
## ── Operations
## • Scaling for: all_predictors()
## • Centering for: all_predictors()
## • PCA extraction with: all_predictors()
pca_prepared <- prep(pca_recipe)

pca_prepared
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## predictor: 5
## id:        3
## 
## ── Training information
## Training data contained 41187 data points and no incomplete rows.
## 
## ── Operations
## • Scaling for: SDC_AGE_CALC, PA_TOTAL_SHORT, PM_BMI_SR, ... | Trained
## • Centering for: SDC_AGE_CALC, PA_TOTAL_SHORT, PM_BMI_SR, ... | Trained
## • PCA extraction with: SDC_AGE_CALC PA_TOTAL_SHORT, ... | Trained
pca_baked <- bake(pca_prepared, data)
pca_baked
## # A tibble: 41,187 × 8
##    ID         SDC_INCOME SDC_GENDER    PC1     PC2    PC3      PC4    PC5
##    <chr>      <fct>      <fct>       <dbl>   <dbl>  <dbl>    <dbl>  <dbl>
##  1 SYN_58621  6          2           0.226 -0.183   0.873 -0.161   -0.770
##  2 SYN_58622  6          2          -0.372  0.719  -0.184 -0.565    0.912
##  3 SYN_58623  4          2           0.792 -1.37    1.35   0.840    0.974
##  4 SYN_58624  3          2          -0.526  0.0250  0.418 -1.16     1.08 
##  5 SYN_58625  4          2           1.45   1.80   -1.75   1.69     0.578
##  6 SYN_58626  4          2          -0.987 -1.62    1.14   1.10    -0.154
##  7 SYN_58627  5          2          -1.65  -0.294   0.240 -0.133   -0.466
##  8 SYN_58628  3          2           0.252  1.04   -0.707 -0.0320   1.20 
##  9 SYN_58629  3          2           0.771 -0.578  -0.915  2.20     0.645
## 10 SYN_586210 5          2           0.241 -0.587  -1.47   0.00528  0.747
## # ℹ 41,177 more rows
pca_variables <- tidy(pca_prepared, id = "pca_id", type = "coef")

ggplot(pca_variables) +
  geom_point(aes(x = value, y = terms, color = component))+
  labs(color = NULL) +
  geom_vline(xintercept=0) + 
  geom_vline(xintercept=-0.2, linetype = 'dashed') + 
  geom_vline(xintercept=0.2, linetype = 'dashed') + 
  facet_wrap(~ component) +
  theme_minimal()

pca_variances <- tidy(pca_prepared, id = "pca_id", type = "variance")

pca_variance <- pca_variances |> filter(terms == "percent variance")
pca_variance$component <- as.factor(pca_variance$component)
pca_variance$comp <- as.numeric(pca_variance$component)

ggplot(pca_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  labs(x = "Principal Components", y = "Variance explained (%)") +
  theme_minimal()

pca_cummul_variance <- pca_variances |> filter(terms == "cumulative percent variance")
pca_cummul_variance$component <- as.factor(pca_cummul_variance$component)
pca_cummul_variance$comp <- as.numeric(pca_cummul_variance$component)

ggplot(pca_cummul_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  labs(x = "Principal Components", y = "Cummulative Variance explained (%)") +
  theme_minimal()

pca_corr <- pca_baked |> select(!(ID))

pca_corr %>% 
  correlate() %>%
  rearrange() %>%
  shave()  %>%
  rplot(print_cor=TRUE)
## Non-numeric variables removed from input: `SDC_INCOME`, and `SDC_GENDER`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'

ggplot(pca_baked, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs(x = "PC1", y = "PC2") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

scatter_3d <- plot_ly(pca_baked, x = ~PC3, y = ~PC4, z = ~PC5, type = "scatter3d", mode = "markers",
                  marker = list(size = 3)) %>%
                  layout(title = "3D Scatter Plot",
                         scene = list(xaxis = list(title = "PC3"),
                                      yaxis = list(title = "PC4"),
                                      zaxis = list(title = "PC5")))

# Display the 3D scatter plot
scatter_3d